/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/x64/jit_generator.hpp"

#include "cpu/x64/gemm/f32/common_f32.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

// Default constructor: forwards this kernel's jit_name() to the
// jit_generator_t base-class constructor. The constructor body itself is
// empty; the instruction stream for the kernel is described in generate().
jit_sse41_kernel_b0_sgemm_kern_t::jit_sse41_kernel_b0_sgemm_kern_t()
    : jit_generator_t(jit_name()) {}

void jit_sse41_kernel_b0_sgemm_kern_t::generate() {

#ifndef _WIN32

#define M rdi
#define N rsi
#define K rdx
#define A r8
#define B r9
#define C rcx
#define LDC r10

#define AA r15
#define I r11
#define J r12
#define H rax
#define AO rbx
#define BO rbp
#define CO1 r13
#define CO2 r14

#define OLD_C (8 + stacksize + rsp)
#define OLD_LDC (16 + stacksize + rsp)

#else

#define M rcx
#define N rdx
#define K r8
#define A rdi
#define B rsi
#define C r9
#define LDC r10
#define AA r15
#define I r11
#define J r12
#define H rax
#define AO rbx
#define BO rbp
#define CO1 r13
#define CO2 r14

#define OLD_A 40 + stacksize + rsp
#define OLD_B 48 + stacksize + rsp
#define OLD_C 56 + stacksize + rsp
#define OLD_LDC 64 + stacksize + rsp

#endif

    inLocalLabel();
    {
        std::vector<Xbyak::Label> labels(93);
        preamble();
        auto stacksize = get_size_of_abi_save_regs();
#ifdef _WIN32
        mov(A, ptr[OLD_A]);
        mov(B, ptr[OLD_B]);
#endif
        mov(C, ptr[OLD_C]);
        mov(LDC, ptr[OLD_LDC]);

        mov(M, qword[M]);
        mov(N, qword[N]);
        mov(K, qword[K]);
        shl(LDC, 0x2);
        sub(A, -128);
        sub(B, -128);
        mov(J, M);
        cmp(J, 0x8);
        jl(labels[90], T_NEAR);
        align(4);

        L(labels[72]);
        mov(AA, K);
        imul(AA, AA, 0x20);
        add(AA, A);
        mov(CO1, C);
        add(C, 0x20);
        mov(BO, B);
        mov(I, N);
        cmp(I, 0x4);
        jl(labels[73], T_NEAR);
        align(4);

        L(labels[75]);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        movups(xmm1, xword[A - 0x70]);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x60]);
        xorps(xmm10, xmm10);
        movups(xmm3, xword[A - 0x50]);
        xorps(xmm11, xmm11);
        movaps(xmm4, xword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movaps(xmm5, xword[BO - 0x70]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(labels[69], T_NEAR);
        sub(H, 0x1e);
        jle(labels[57], T_NEAR);
        align(4);

        L(labels[85]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -64);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[85], T_NEAR);
        align(4);

        L(labels[57]);
        prefetcht0(byte[CO1 + 0x1c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x1c]);
        prefetcht0(byte[CO2 + 0x1c]);
        prefetcht0(byte[CO2 + LDC * 1 + 0x1c]);
        add(H, 0x1e);
        align(4);

        L(labels[61]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -64);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[61], T_NEAR);
        align(4);

        L(labels[69]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[71], T_NEAR);
        align(4);

        L(labels[70]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        sub(AO, -32);
        sub(BO, -16);
        dec(H);
        jg(labels[70], T_NEAR);
        align(4);

        L(labels[71]);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm10, xmm1);
        movaps(xmm11, xmm1);
        shufps(xmm10, xmm0, 0xcc);
        shufps(xmm11, xmm0, 0x66);
        movaps(xmm0, xmm12);
        unpcklpd(xmm12, xmm13);
        unpckhpd(xmm0, xmm13);
        movaps(xmm1, xmm14);
        unpckhpd(xmm14, xmm15);
        unpcklpd(xmm1, xmm15);
        movaps(xmm13, xmm12);
        shufps(xmm12, xmm14, 0xcc);
        shufps(xmm13, xmm14, 0x66);
        movaps(xmm14, xmm1);
        movaps(xmm15, xmm1);
        shufps(xmm14, xmm0, 0xcc);
        shufps(xmm15, xmm0, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + 0x10], xmm12);
        movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
        movups(xword[CO1 + LDC * 1 + 0x10], xmm13);
        movups(xword[CO2], xmm10);
        movups(xword[CO2 + 0x10], xmm14);
        movups(xword[CO2 + LDC * 1], xmm11);
        movups(xword[CO2 + LDC * 1 + 0x10], xmm15);
        lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 4]);
        sub(I, 0x4);
        cmp(I, 0x4);
        jge(labels[75], T_NEAR);
        align(4);

        L(labels[73]);
        test(I, 0x2);
        jle(labels[81], T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        movups(xmm1, xword[A - 0x70]);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x60]);
        xorps(xmm10, xmm10);
        movups(xmm3, xword[A - 0x50]);
        xorps(xmm11, xmm11);
        movddup(xmm4, qword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movddup(xmm5, qword[BO - 0x78]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(labels[78], T_NEAR);
        sub(H, 0x1e);
        jle(labels[76], T_NEAR);
        align(4);

        L(labels[74]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -32);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[74], T_NEAR);
        align(4);

        L(labels[76]);
        prefetcht0(byte[CO1 + 0x1c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x1c]);
        add(H, 0x1e);
        align(4);

        L(labels[77]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -32);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[77], T_NEAR);
        align(4);

        L(labels[78]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[80], T_NEAR);
        align(4);

        L(labels[79]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x78]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        sub(AO, -32);
        sub(BO, -8);
        dec(H);
        jg(labels[79], T_NEAR);
        align(4);

        L(labels[80]);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm0, xmm12);
        unpcklpd(xmm12, xmm13);
        unpckhpd(xmm0, xmm13);
        movaps(xmm1, xmm14);
        unpckhpd(xmm14, xmm15);
        unpcklpd(xmm1, xmm15);
        movaps(xmm13, xmm12);
        shufps(xmm12, xmm14, 0xcc);
        shufps(xmm13, xmm14, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + 0x10], xmm12);
        movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
        movups(xword[CO1 + LDC * 1 + 0x10], xmm13);
        lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 2]);
        align(4);

        L(labels[81]);
        test(I, 0x1);
        jle(labels[89], T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        movups(xmm1, xword[A - 0x70]);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x60]);
        xorps(xmm10, xmm10);
        movups(xmm3, xword[A - 0x50]);
        xorps(xmm11, xmm11);
        movss(xmm4, dword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movss(xmm5, dword[BO - 0x7c]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(labels[86], T_NEAR);
        sub(H, 0x1e);
        jle(labels[83], T_NEAR);
        align(4);

        L(labels[82]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -16);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[82], T_NEAR);
        align(4);

        L(labels[83]);
        prefetcht0(byte[CO1 + 0x1c]);
        add(H, 0x1e);
        align(4);

        L(labels[84]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -16);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[84], T_NEAR);
        align(4);

        L(labels[86]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[88], T_NEAR);
        align(4);

        L(labels[87]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x7c]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        sub(AO, -32);
        sub(BO, -4);
        dec(H);
        jg(labels[87], T_NEAR);
        align(4);

        L(labels[88]);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm0, xmm12);
        unpcklpd(xmm12, xmm13);
        unpckhpd(xmm0, xmm13);
        movaps(xmm1, xmm14);
        unpckhpd(xmm14, xmm15);
        unpcklpd(xmm1, xmm15);
        movaps(xmm13, xmm12);
        shufps(xmm12, xmm14, 0xcc);
        shufps(xmm13, xmm14, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + 0x10], xmm12);
        lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 1]);
        align(4);

        L(labels[89]);
        mov(A, AO);
        sub(J, 0x8);
        cmp(J, 0x8);
        jge(labels[72], T_NEAR);
        align(4);

        // labels[90]: optional block of four in J (bit 2 of J).
        L(labels[90]);
        // test yields 0 or 4 and clears OF, so jle is taken exactly when the
        // bit is clear; the block is then skipped.
        test(J, 0x4);
        jle(labels[20], T_NEAR);
        // AA = A + K * 0x10; the loops below use AA only as a prefetcht0
        // address.
        mov(AA, K);
        imul(AA, AA, 0x10);
        add(AA, A);
        // CO1 is the C cursor for this block; C itself moves on by 0x10 bytes
        // (four floats) for the block that follows.
        mov(CO1, C);
        add(C, 0x10);
        // Restart the B cursor and the I counter (I = N).
        mov(BO, B);
        mov(I, N);
        // With fewer than four left in I, skip the 4-wide tile loop.
        cmp(I, 0x4);
        jl(labels[5], T_NEAR);
        align(4);

        // labels[91]: set-up of one tile with four floats of A per k-step and
        // four floats of B per k-step; re-entered from labels[4] while I >= 4.
        // All A/B displacements are written relative to -0x80; presumably the
        // pointers were biased by 0x80 earlier - confirm above this section.
        L(labels[91]);
        // CO2 = CO1 + 2 * LDC, the cursor for the third and fourth LDC column.
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        // Preload A for the first two k-steps (xmm0, xmm2) and B for the
        // first two k-steps (xmm4, xmm5) while zeroing xmm8..xmm15. The loops
        // of this tile accumulate into xmm8..xmm11 only.
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x70]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movaps(xmm4, xword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movaps(xmm5, xword[BO - 0x70]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        // Every tile restarts the A cursor at A.
        mov(AO, A);
        // H = K >> 2: iteration count of the 4x-unrolled loops. If it is not
        // positive, go straight to the K & 3 handling at labels[2].
        mov(H, K);
        sar(H, 0x2);
        jle(labels[2], T_NEAR);
        // Hold back 30 (0x1e) iterations for labels[1]; when there are no
        // more than 30 in total, skip labels[92] entirely.
        sub(H, 0x1e);
        jle(labels[0], T_NEAR);
        align(4);

        // labels[92]: 4x-unrolled K loop (16-byte A vectors, 16-byte B
        // vectors). It runs only the iterations beyond the 30 held back by the
        // set-up code. Each k-step multiplies the A vector by the B vector
        // and by three lane permutations of it (pshufd 0xb1 swaps
        // neighbouring lanes, 0x1b reverses all four lanes) and adds the four
        // products to xmm8, xmm9, xmm10 and xmm11 respectively.
        L(labels[92]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 64, AO += 64) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[92], T_NEAR);
        align(4);

        // labels[0]: reached before the final (at most 30) unrolled
        // iterations. Prefetches the four C locations this tile stores to
        // later: CO1, CO1 + LDC, CO2 and CO2 + LDC (each at byte offset 0xc).
        L(labels[0]);
        prefetcht0(byte[CO1 + 0xc]);
        prefetcht0(byte[CO1 + LDC * 1 + 0xc]);
        prefetcht0(byte[CO2 + 0xc]);
        prefetcht0(byte[CO2 + LDC * 1 + 0xc]);
        // Undo the earlier sub(H, 0x1e): H becomes the number of unrolled
        // iterations still to run.
        add(H, 0x1e);
        align(4);

        // labels[1]: final pass of the 4x-unrolled K loop (16-byte A vectors,
        // 16-byte B vectors); runs the H iterations restored by labels[0].
        // Each k-step multiplies the A vector by the B vector and by three
        // lane permutations of it (pshufd 0xb1 swaps neighbouring lanes, 0x1b
        // reverses all four lanes) and adds the four products to xmm8, xmm9,
        // xmm10 and xmm11 respectively.
        L(labels[1]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 64, AO += 64) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[1], T_NEAR);
        align(4);

        // labels[2]: H = K & 3, the k-steps left over after the 4x-unrolled
        // loops; with none left, jump directly to the write-back at labels[4].
        L(labels[2]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[4], T_NEAR);
        align(4);

        // labels[3]: remainder K loop, one k-step per iteration (H = K & 3).
        // Multiplies the A vector in xmm0 by the B vector in xmm4 and by three
        // lane permutations of it (pshufd 0xb1 swaps neighbouring lanes, 0x1b
        // reverses all four lanes), accumulating into xmm8..xmm11.
        // NOTE(review): the movaps(xmm7, ...) copies are never read inside
        // this loop; they look like leftovers from a wider variant - confirm
        // before removing, since doing so changes the emitted code layout.
        L(labels[3]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        // Load B and A for the next k-step (16 bytes each).
        movaps(xmm4, xword[BO - 0x70]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x70]);
        addps(xmm11, xmm6);
        // Advance both cursors by one k-step: AO += 16, BO += 16.
        sub(AO, -16);
        sub(BO, -16);
        dec(H);
        jg(labels[3], T_NEAR);
        align(4);

        // labels[4]: write-back of the 4x4 tile. The accumulators hold the
        // products in permuted lane order; the unpck*pd / shufps sequence
        // sorts them so that each of xmm8..xmm11 ends up with the four
        // results belonging to a single B element. C is written, never read.
        L(labels[4]);
        // Regroup the 64-bit halves of xmm8/xmm9 and xmm10/xmm11.
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        // shufps 0xcc keeps lanes 0 and 3 of each operand, 0x66 lanes 2 and 1.
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm10, xmm1);
        movaps(xmm11, xmm1);
        shufps(xmm10, xmm0, 0xcc);
        shufps(xmm11, xmm0, 0x66);
        // Four 16-byte stores, one per LDC column: CO1, CO1 + LDC, CO2
        // (= CO1 + 2 * LDC) and CO2 + LDC.
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
        movups(xword[CO2], xmm10);
        movups(xword[CO2 + LDC * 1], xmm11);
        // Move both C cursors four LDC strides on.
        lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 4]);
        // Four units of I done; repeat from labels[91] while four remain.
        sub(I, 0x4);
        cmp(I, 0x4);
        jge(labels[91], T_NEAR);
        align(4);

        // labels[5]: set-up of a tile with four floats of A and two floats of
        // B per k-step; executed only when bit 1 of I is set.
        // All A/B displacements are written relative to -0x80; presumably the
        // pointers were biased by 0x80 earlier - confirm above this section.
        L(labels[5]);
        // test yields 0 or 2 and clears OF, so jle is taken exactly when the
        // bit is clear.
        test(I, 0x2);
        jle(labels[12], T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        // Preload A for the first two k-steps (xmm0, xmm2) while zeroing
        // xmm8..xmm15; the loops of this tile accumulate into xmm8..xmm11.
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x70]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        // movddup loads 8 bytes of B (two floats) and duplicates them into
        // both halves of the register.
        movddup(xmm4, qword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movddup(xmm5, qword[BO - 0x78]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        // Every tile restarts the A cursor at A.
        mov(AO, A);
        // H = K >> 2: iteration count of the 4x-unrolled loops. If it is not
        // positive, go straight to the K & 3 handling at labels[9].
        mov(H, K);
        sar(H, 0x2);
        jle(labels[9], T_NEAR);
        // Hold back 30 (0x1e) iterations for labels[8]; when there are no
        // more than 30 in total, skip labels[6] entirely.
        sub(H, 0x1e);
        jle(labels[7], T_NEAR);
        align(4);

        // labels[6]: 4x-unrolled K loop (16-byte A vectors, 8-byte B pairs
        // duplicated by movddup). It runs only the iterations beyond the 30
        // held back by the set-up code. Each k-step multiplies the A vector
        // by the B vector and by three lane permutations of it (pshufd 0xb1
        // swaps neighbouring lanes, 0x1b reverses all four lanes) and adds
        // the four products to xmm8, xmm9, xmm10 and xmm11 respectively.
        L(labels[6]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 32, AO += 64) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[6], T_NEAR);
        align(4);

        // labels[7]: reached before the final (at most 30) unrolled
        // iterations. Prefetches the two C locations this tile stores to
        // later: CO1 and CO1 + LDC (each at byte offset 0xc).
        L(labels[7]);
        prefetcht0(byte[CO1 + 0xc]);
        prefetcht0(byte[CO1 + LDC * 1 + 0xc]);
        // Undo the earlier sub(H, 0x1e): H becomes the number of unrolled
        // iterations still to run.
        add(H, 0x1e);
        align(4);

        // labels[8]: final pass of the 4x-unrolled K loop (16-byte A vectors,
        // 8-byte B pairs duplicated by movddup); runs the H iterations
        // restored by labels[7]. Each k-step multiplies the A vector by the B
        // vector and by three lane permutations of it (pshufd 0xb1 swaps
        // neighbouring lanes, 0x1b reverses all four lanes) and adds the four
        // products to xmm8, xmm9, xmm10 and xmm11 respectively.
        L(labels[8]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 32, AO += 64) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[8], T_NEAR);
        align(4);

        // labels[9]: H = K & 3, the k-steps left over after the 4x-unrolled
        // loops; with none left, jump directly to the write-back at
        // labels[11].
        L(labels[9]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[11], T_NEAR);
        align(4);

        // labels[10]: remainder K loop, one k-step per iteration (H = K & 3).
        // Multiplies the A vector in xmm0 by the B vector in xmm4 and by three
        // lane permutations of it (pshufd 0xb1 swaps neighbouring lanes, 0x1b
        // reverses all four lanes), accumulating into xmm8..xmm11.
        // NOTE(review): the movaps(xmm7, ...) copies are never read inside
        // this loop; they look like leftovers from a wider variant - confirm
        // before removing, since doing so changes the emitted code layout.
        L(labels[10]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        // Load B (8 bytes, duplicated) and A (16 bytes) for the next k-step.
        movddup(xmm4, qword[BO - 0x78]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x70]);
        addps(xmm11, xmm6);
        // Advance both cursors by one k-step: AO += 16, BO += 8.
        sub(AO, -16);
        sub(BO, -8);
        dec(H);
        jg(labels[10], T_NEAR);
        align(4);

        // labels[11]: write-back of the tile with four floats per column and
        // two LDC columns. The unpck*pd / shufps sequence sorts the permuted
        // accumulator lanes into xmm8 and xmm9. C is written, never read.
        L(labels[11]);
        // NOTE(review): xmm0 and xmm1 are computed below but not stored in
        // this section.
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        // shufps 0xcc keeps lanes 0 and 3 of each operand, 0x66 lanes 2 and 1.
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        // Two 16-byte stores: CO1 and CO1 + LDC.
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
        // Move both C cursors two LDC strides on.
        lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 2]);
        align(4);

        // labels[12]: set-up of a tile with four floats of A and one float of
        // B per k-step; executed only when bit 0 of I is set.
        // All A/B displacements are written relative to -0x80; presumably the
        // pointers were biased by 0x80 earlier - confirm above this section.
        L(labels[12]);
        // test yields 0 or 1 and clears OF, so jle is taken exactly when the
        // bit is clear.
        test(I, 0x1);
        jle(labels[19], T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        // Preload A for the first two k-steps (xmm0, xmm2) while zeroing
        // xmm8..xmm15; the loops of this tile accumulate into xmm8..xmm11.
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x70]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        // movss from memory loads one float into lane 0 and zeroes the
        // remaining lanes.
        movss(xmm4, dword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movss(xmm5, dword[BO - 0x7c]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        // Every tile restarts the A cursor at A.
        mov(AO, A);
        // H = K >> 2: iteration count of the 4x-unrolled loops. If it is not
        // positive, go straight to the K & 3 handling at labels[16].
        mov(H, K);
        sar(H, 0x2);
        jle(labels[16], T_NEAR);
        // Hold back 30 (0x1e) iterations for labels[15]; when there are no
        // more than 30 in total, skip labels[13] entirely.
        sub(H, 0x1e);
        jle(labels[14], T_NEAR);
        align(4);

        // labels[13]: 4x-unrolled K loop (16-byte A vectors, single B floats
        // loaded by movss). It runs only the iterations beyond the 30 held
        // back by the set-up code. Each k-step multiplies the A vector by the
        // B vector and by three lane permutations of it (pshufd 0xb1 swaps
        // neighbouring lanes, 0x1b reverses all four lanes) and adds the four
        // products to xmm8, xmm9, xmm10 and xmm11 respectively.
        L(labels[13]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 16, AO += 64) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[13], T_NEAR);
        align(4);

        // labels[14]: reached before the final (at most 30) unrolled
        // iterations. Prefetches the C location this tile stores to later
        // (CO1, at byte offset 0xc).
        L(labels[14]);
        prefetcht0(byte[CO1 + 0xc]);
        // Undo the earlier sub(H, 0x1e): H becomes the number of unrolled
        // iterations still to run.
        add(H, 0x1e);
        align(4);

        // labels[15]: final pass of the 4x-unrolled K loop (16-byte A
        // vectors, single B floats loaded by movss); runs the H iterations
        // restored by labels[14]. Each k-step multiplies the A vector by the
        // B vector and by three lane permutations of it (pshufd 0xb1 swaps
        // neighbouring lanes, 0x1b reverses all four lanes) and adds the four
        // products to xmm8, xmm9, xmm10 and xmm11 respectively.
        L(labels[15]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 16, AO += 64) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[15], T_NEAR);
        align(4);

        // labels[16]: H = K & 3, the k-steps left over after the 4x-unrolled
        // loops; with none left, jump directly to the write-back at
        // labels[18].
        L(labels[16]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[18], T_NEAR);
        align(4);

        // labels[17]: remainder K loop, one k-step per iteration (H = K & 3).
        // Multiplies the A vector in xmm0 by the B vector in xmm4 and by three
        // lane permutations of it (pshufd 0xb1 swaps neighbouring lanes, 0x1b
        // reverses all four lanes), accumulating into xmm8..xmm11.
        // NOTE(review): the movaps(xmm7, ...) copies are never read inside
        // this loop; they look like leftovers from a wider variant - confirm
        // before removing, since doing so changes the emitted code layout.
        L(labels[17]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        // Load B (one float) and A (16 bytes) for the next k-step.
        movss(xmm4, dword[BO - 0x7c]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x70]);
        addps(xmm11, xmm6);
        // Advance both cursors by one k-step: AO += 16, BO += 4.
        sub(AO, -16);
        sub(BO, -4);
        dec(H);
        jg(labels[17], T_NEAR);
        align(4);

        // labels[18]: write-back of the tile with four floats in a single LDC
        // column. The unpck*pd / shufps sequence gathers the permuted
        // accumulator lanes into xmm8. C is written, never read.
        L(labels[18]);
        // NOTE(review): xmm0, xmm1 and xmm9 are computed below but not stored
        // in this section.
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        // shufps 0xcc keeps lanes 0 and 3 of each operand, 0x66 lanes 2 and 1.
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        // One 16-byte store at CO1.
        movups(xword[CO1 + 0x0], xmm8);
        // Move both C cursors one LDC stride on.
        lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 1]);
        align(4);

        // labels[19]: end of the block of four in J; A continues from
        // wherever the AO cursor stopped.
        L(labels[19]);
        mov(A, AO);
        align(4);

        // labels[20]: optional block of two in J (bit 1 of J).
        L(labels[20]);
        // test yields 0 or 2 and clears OF, so jle is taken exactly when the
        // bit is clear; the block is then skipped.
        test(J, 0x2);
        jle(labels[43], T_NEAR);
        // AA = A + K * 0x8; the loops below use AA only as a prefetcht0
        // address.
        mov(AA, K);
        imul(AA, AA, 0x8);
        add(AA, A);
        // CO1 is the C cursor for this block; C itself moves on by 0x8 bytes
        // (two floats) for the block that follows.
        mov(CO1, C);
        add(C, 0x8);
        // Restart the B cursor and the I counter (I = N).
        mov(BO, B);
        mov(I, N);
        // With fewer than four left in I, skip the 4-wide tile loop.
        cmp(I, 0x4);
        jl(labels[28], T_NEAR);
        align(4);

        // labels[21]: set-up of one tile with two floats of A per k-step and
        // four floats of B per k-step; re-entered from labels[27] while
        // I >= 4.
        // All A/B displacements are written relative to -0x80; presumably the
        // pointers were biased by 0x80 earlier - confirm above this section.
        L(labels[21]);
        // CO2 = CO1 + 2 * LDC, the cursor for the third and fourth LDC column.
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        // movsd from memory loads 8 bytes of A (two floats) into the low half
        // and zeroes the high half. xmm0/xmm2 hold A for the first two
        // k-steps, xmm4/xmm5 hold B; xmm8..xmm15 are zeroed, and the loops of
        // this tile accumulate into xmm8..xmm11 only.
        movsd(xmm0, qword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movsd(xmm2, qword[A - 0x78]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movaps(xmm4, xword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movaps(xmm5, xword[BO - 0x70]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        // Every tile restarts the A cursor at A.
        mov(AO, A);
        // H = K >> 2: iteration count of the 4x-unrolled loops. If it is not
        // positive, go straight to the K & 3 handling at labels[25].
        mov(H, K);
        sar(H, 0x2);
        jle(labels[25], T_NEAR);
        // Hold back 30 (0x1e) iterations for labels[24]; when there are no
        // more than 30 in total, skip labels[22] entirely.
        sub(H, 0x1e);
        jle(labels[23], T_NEAR);
        align(4);

        // labels[22]: 4x-unrolled K loop (8-byte A pairs loaded by movsd,
        // 16-byte B vectors). It runs only the iterations beyond the 30 held
        // back by the set-up code. Each k-step multiplies the A vector by the
        // B vector and by three lane permutations of it (pshufd 0xb1 swaps
        // neighbouring lanes, 0x1b reverses all four lanes) and adds the four
        // products to xmm8, xmm9, xmm10 and xmm11 respectively.
        L(labels[22]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 64, AO += 32) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[22], T_NEAR);
        align(4);

        // labels[23]: reached before the final (at most 30) unrolled
        // iterations. Prefetches the four C locations this tile stores to
        // later: CO1, CO1 + LDC, CO2 and CO2 + LDC (each at byte offset 0x4).
        L(labels[23]);
        prefetcht0(byte[CO1 + 0x4]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x4]);
        prefetcht0(byte[CO2 + 0x4]);
        prefetcht0(byte[CO2 + LDC * 1 + 0x4]);
        // Undo the earlier sub(H, 0x1e): H becomes the number of unrolled
        // iterations still to run.
        add(H, 0x1e);
        align(4);

        // labels[24]: final pass of the 4x-unrolled K loop (8-byte A pairs
        // loaded by movsd, 16-byte B vectors); runs the H iterations restored
        // by labels[23]. Each k-step multiplies the A vector by the B vector
        // and by three lane permutations of it (pshufd 0xb1 swaps
        // neighbouring lanes, 0x1b reverses all four lanes) and adds the four
        // products to xmm8, xmm9, xmm10 and xmm11 respectively.
        L(labels[24]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 64, AO += 32) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[24], T_NEAR);
        align(4);

        // labels[25]: H = K & 3, the k-steps left over after the 4x-unrolled
        // loops; with none left, jump directly to the write-back at
        // labels[27].
        L(labels[25]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[27], T_NEAR);
        align(4);

        // labels[26]: remainder K loop, one k-step per iteration (H = K & 3).
        // Multiplies the A vector in xmm0 by the B vector in xmm4 and by three
        // lane permutations of it (pshufd 0xb1 swaps neighbouring lanes, 0x1b
        // reverses all four lanes), accumulating into xmm8..xmm11.
        // NOTE(review): the movaps(xmm7, ...) copies are never read inside
        // this loop; they look like leftovers from a wider variant - confirm
        // before removing, since doing so changes the emitted code layout.
        L(labels[26]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        // Load B (16 bytes) and A (8 bytes) for the next k-step.
        movaps(xmm4, xword[BO - 0x70]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x78]);
        addps(xmm11, xmm6);
        // Advance both cursors by one k-step: AO += 8, BO += 16.
        sub(AO, -8);
        sub(BO, -16);
        dec(H);
        jg(labels[26], T_NEAR);
        align(4);

        // labels[27]: write-back of the tile with two floats per column and
        // four LDC columns. The unpck*pd / shufps sequence sorts the permuted
        // accumulator lanes so that each of xmm8..xmm11 carries the results
        // of one column; only the low 8 bytes of each are stored. C is
        // written, never read.
        L(labels[27]);
        // Regroup the 64-bit halves of xmm8/xmm9 and xmm10/xmm11.
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        // shufps 0xcc keeps lanes 0 and 3 of each operand, 0x66 lanes 2 and 1.
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm10, xmm1);
        movaps(xmm11, xmm1);
        shufps(xmm10, xmm0, 0xcc);
        shufps(xmm11, xmm0, 0x66);
        // Four 8-byte stores (movlps writes the low two floats), one per LDC
        // column: CO1, CO1 + LDC, CO2 (= CO1 + 2 * LDC) and CO2 + LDC.
        movlps(qword[CO1 + 0x0], xmm8);
        movlps(qword[CO1 + LDC * 1 + 0x0], xmm9);
        movlps(qword[CO2], xmm10);
        movlps(qword[CO2 + LDC * 1], xmm11);
        // Move both C cursors four LDC strides on.
        lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 4]);
        // Four units of I done; repeat from labels[21] while four remain.
        sub(I, 0x4);
        cmp(I, 0x4);
        jge(labels[21], T_NEAR);
        align(4);

        // labels[28]: set-up of a tile with two floats of A and two floats of
        // B per k-step; executed only when bit 1 of I is set.
        // All A/B displacements are written relative to -0x80; presumably the
        // pointers were biased by 0x80 earlier - confirm above this section.
        L(labels[28]);
        // test yields 0 or 2 and clears OF, so jle is taken exactly when the
        // bit is clear.
        test(I, 0x2);
        jle(labels[35], T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        // movsd from memory loads 8 bytes of A into the low half and zeroes
        // the high half; movddup loads 8 bytes of B and duplicates them into
        // both halves. xmm8..xmm15 are zeroed; the loops of this tile
        // accumulate into xmm8..xmm11 only.
        movsd(xmm0, qword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movsd(xmm2, qword[A - 0x78]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movddup(xmm4, qword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movddup(xmm5, qword[BO - 0x78]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        // Every tile restarts the A cursor at A.
        mov(AO, A);
        // H = K >> 2: iteration count of the 4x-unrolled loops. If it is not
        // positive, go straight to the K & 3 handling at labels[32].
        mov(H, K);
        sar(H, 0x2);
        jle(labels[32], T_NEAR);
        // Hold back 30 (0x1e) iterations for labels[31]; when there are no
        // more than 30 in total, skip labels[29] entirely.
        sub(H, 0x1e);
        jle(labels[30], T_NEAR);
        align(4);

        // labels[29]: 4x-unrolled K loop (8-byte A pairs loaded by movsd,
        // 8-byte B pairs duplicated by movddup). It runs only the iterations
        // beyond the 30 held back by the set-up code. Each k-step multiplies
        // the A vector by the B vector and by three lane permutations of it
        // (pshufd 0xb1 swaps neighbouring lanes, 0x1b reverses all four
        // lanes) and adds the four products to xmm8, xmm9, xmm10 and xmm11
        // respectively.
        L(labels[29]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 32, AO += 32) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[29], T_NEAR);
        align(4);

        // labels[30]: reached before the final (at most 30) unrolled
        // iterations. Prefetches the two C locations this tile stores to
        // later: CO1 and CO1 + LDC (each at byte offset 0x4).
        L(labels[30]);
        prefetcht0(byte[CO1 + 0x4]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x4]);
        // Undo the earlier sub(H, 0x1e): H becomes the number of unrolled
        // iterations still to run.
        add(H, 0x1e);
        align(4);

        // labels[31]: final pass of the 4x-unrolled K loop (8-byte A pairs
        // loaded by movsd, 8-byte B pairs duplicated by movddup); runs the H
        // iterations restored by labels[30]. Each k-step multiplies the A
        // vector by the B vector and by three lane permutations of it (pshufd
        // 0xb1 swaps neighbouring lanes, 0x1b reverses all four lanes) and
        // adds the four products to xmm8, xmm9, xmm10 and xmm11 respectively.
        L(labels[31]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 32, AO += 32) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[31], T_NEAR);
        align(4);

        // labels[32]: H = K & 3, the k-steps left over after the 4x-unrolled
        // loops; with none left, jump directly to the write-back at
        // labels[34].
        L(labels[32]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[34], T_NEAR);
        align(4);

        // labels[33]: remainder K loop, one k-step per iteration (H = K & 3).
        // Multiplies the A vector in xmm0 by the B vector in xmm4 and by three
        // lane permutations of it (pshufd 0xb1 swaps neighbouring lanes, 0x1b
        // reverses all four lanes), accumulating into xmm8..xmm11.
        // NOTE(review): the movaps(xmm7, ...) copies are never read inside
        // this loop; they look like leftovers from a wider variant - confirm
        // before removing, since doing so changes the emitted code layout.
        L(labels[33]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        // Load B (8 bytes, duplicated) and A (8 bytes) for the next k-step.
        movddup(xmm4, qword[BO - 0x78]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x78]);
        addps(xmm11, xmm6);
        // Advance both cursors by one k-step: AO += 8, BO += 8.
        sub(AO, -8);
        sub(BO, -8);
        dec(H);
        jg(labels[33], T_NEAR);
        align(4);

        // labels[34]: write-back of the tile with two floats per column and
        // two LDC columns. The unpck*pd / shufps sequence sorts the permuted
        // accumulator lanes into xmm8 and xmm9; only the low 8 bytes of each
        // are stored. C is written, never read.
        L(labels[34]);
        // NOTE(review): xmm0 and xmm1 are computed below but not stored in
        // this section.
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        // shufps 0xcc keeps lanes 0 and 3 of each operand, 0x66 lanes 2 and 1.
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        // Two 8-byte stores (movlps writes the low two floats): CO1 and
        // CO1 + LDC.
        movlps(qword[CO1 + 0x0], xmm8);
        movlps(qword[CO1 + LDC * 1 + 0x0], xmm9);
        // Move both C cursors two LDC strides on.
        lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 2]);
        align(4);

        // labels[35]: set-up of a tile with two floats of A and one float of
        // B per k-step; executed only when bit 0 of I is set.
        // All A/B displacements are written relative to -0x80; presumably the
        // pointers were biased by 0x80 earlier - confirm above this section.
        L(labels[35]);
        // test yields 0 or 1 and clears OF, so jle is taken exactly when the
        // bit is clear.
        test(I, 0x1);
        jle(labels[42], T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        // movsd from memory loads 8 bytes of A into the low half and zeroes
        // the high half; movss from memory loads one float of B into lane 0
        // and zeroes the other lanes. xmm8..xmm15 are zeroed here.
        movsd(xmm0, qword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movsd(xmm2, qword[A - 0x78]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movss(xmm4, dword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movss(xmm5, dword[BO - 0x7c]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        // Every tile restarts the A cursor at A.
        mov(AO, A);
        // H = K >> 2: iteration count of the 4x-unrolled loops. If it is not
        // positive, go straight to labels[39].
        mov(H, K);
        sar(H, 0x2);
        jle(labels[39], T_NEAR);
        // Hold back 30 (0x1e) iterations for the loop after labels[37]; when
        // there are no more than 30 in total, skip labels[36] entirely.
        sub(H, 0x1e);
        jle(labels[37], T_NEAR);
        align(4);

        // labels[36]: 4x-unrolled K loop (8-byte A pairs loaded by movsd,
        // single B floats loaded by movss). It runs only the iterations
        // beyond the 30 held back by the set-up code. Each k-step multiplies
        // the A vector by the B vector and by three lane permutations of it
        // (pshufd 0xb1 swaps neighbouring lanes, 0x1b reverses all four
        // lanes) and adds the four products to xmm8, xmm9, xmm10 and xmm11
        // respectively.
        L(labels[36]);
        // Prefetch ahead of the A and B cursors.
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 0: A in xmm0, B in xmm4; both are reloaded for k-step 2.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 1: A in xmm2, B in xmm5; both are reloaded for k-step 3.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 2: A in xmm0, B in xmm4; the reloads feed the code that
        // runs after this iteration.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 3: A in xmm2, B in xmm5, interleaved with the cursor updates
        // (AA += 8, BO += 16, AO += 32) and the prefetch through AA.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[36], T_NEAR);
        align(4);

        L(labels[37]);
        prefetcht0(byte[CO1 + 0x4]);
        add(H, 0x1e);
        align(4);

        L(labels[38]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[38], T_NEAR);
        align(4);

        L(labels[39]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[41], T_NEAR);
        align(4);

        L(labels[40]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x7c]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x78]);
        addps(xmm11, xmm6);
        sub(AO, -8);
        sub(BO, -4);
        dec(H);
        jg(labels[40], T_NEAR);
        align(4);

        L(labels[41]);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movlps(qword[CO1 + 0x0], xmm8);
        lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 1]);
        align(4);

        L(labels[42]);
        mov(A, AO);
        align(4);

        L(labels[43]);
        test(J, 0x1);
        jle(labels[68], T_NEAR);
        mov(AA, K);
        imul(AA, AA, 0x4);
        add(AA, A);
        mov(CO1, C);
        add(C, 0x4);
        mov(BO, B);
        mov(I, N);
        cmp(I, 0x4);
        jl(labels[51], T_NEAR);
        align(4);

        L(labels[44]);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movss(xmm0, dword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movss(xmm2, dword[A - 0x7c]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movaps(xmm4, xword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movaps(xmm5, xword[BO - 0x70]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(labels[48], T_NEAR);
        sub(H, 0x1e);
        jle(labels[46], T_NEAR);
        align(4);

        L(labels[45]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[45], T_NEAR);
        align(4);

        L(labels[46]);
        prefetcht0(byte[CO1 + 0x0]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x0]);
        prefetcht0(byte[CO2]);
        prefetcht0(byte[CO2 + LDC * 1]);
        add(H, 0x1e);
        align(4);

        L(labels[47]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[47], T_NEAR);
        align(4);

        L(labels[48]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[50], T_NEAR);
        align(4);

        L(labels[49]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x70]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x7c]);
        addps(xmm11, xmm6);
        sub(AO, -4);
        sub(BO, -16);
        dec(H);
        jg(labels[49], T_NEAR);
        align(4);

        L(labels[50]);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm10, xmm1);
        movaps(xmm11, xmm1);
        shufps(xmm10, xmm0, 0xcc);
        shufps(xmm11, xmm0, 0x66);
        movss(dword[CO1 + 0x0], xmm8);
        movss(dword[CO1 + LDC * 1 + 0x0], xmm9);
        movss(dword[CO2], xmm10);
        movss(dword[CO2 + LDC * 1], xmm11);
        lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 4]);
        sub(I, 0x4);
        cmp(I, 0x4);
        jge(labels[44], T_NEAR);
        align(4);

        L(labels[51]);
        test(I, 0x2);
        jle(labels[59], T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movss(xmm0, dword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movss(xmm2, dword[A - 0x7c]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movddup(xmm4, qword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movddup(xmm5, qword[BO - 0x78]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(labels[55], T_NEAR);
        sub(H, 0x1e);
        jle(labels[53], T_NEAR);
        align(4);

        L(labels[52]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[52], T_NEAR);
        align(4);

        L(labels[53]);
        prefetcht0(byte[CO1 + 0x0]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x0]);
        add(H, 0x1e);
        align(4);

        L(labels[54]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[54], T_NEAR);
        align(4);

        L(labels[55]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[58], T_NEAR);
        align(4);

        L(labels[56]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x78]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x7c]);
        addps(xmm11, xmm6);
        sub(AO, -4);
        sub(BO, -8);
        dec(H);
        jg(labels[56], T_NEAR);
        align(4);

        L(labels[58]);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movss(dword[CO1 + 0x0], xmm8);
        movss(dword[CO1 + LDC * 1 + 0x0], xmm9);
        lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 2]);
        align(4);

        L(labels[59]);
        test(I, 0x1);
        jle(labels[67], T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movss(xmm0, dword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movss(xmm2, dword[A - 0x7c]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movss(xmm4, dword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movss(xmm5, dword[BO - 0x7c]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(labels[64], T_NEAR);
        sub(H, 0x1e);
        jle(labels[62], T_NEAR);
        align(4);

        L(labels[60]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[60], T_NEAR);
        align(4);

        L(labels[62]);
        prefetcht0(byte[CO1 + 0x0]);
        add(H, 0x1e);
        align(4);

        L(labels[63]);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(labels[63], T_NEAR);
        align(4);

        L(labels[64]);
        mov(H, K);
        and_(H, 0x3);
        je(labels[66], T_NEAR);
        align(4);

        L(labels[65]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x7c]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x7c]);
        addps(xmm11, xmm6);
        sub(AO, -4);
        sub(BO, -4);
        dec(H);
        jg(labels[65], T_NEAR);
        align(4);

        L(labels[66]);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movss(dword[CO1 + 0x0], xmm8);
        lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 1]);
        align(4);

        L(labels[67]);
        mov(A, AO);
        align(4);

        L(labels[68]);

        postamble();
    }
    outLocalLabel();

#undef M
#undef N
#undef K
#undef A
#undef B
#undef C
#undef LDC
#undef AA
#undef I
#undef J
#undef H
#undef AO
#undef BO
#undef CO1
#undef CO2
#ifdef _WIN32
#undef OLD_A
#undef OLD_B
#endif
#undef OLD_C
#undef OLD_LDC
}

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl
